#install.packages('tidycensus')
#install.packages('position_dodge')
#library(position_dodge)
library(scales)
library(tidycensus)
library(tidyverse)
library(data.table)
library(readxl)
#install.packages('maps')
library(maps)
library(stargazer)
#install.packages('R.utils')
library(ggrepel)
library(R.utils)
library(ggpubr)
#library(arm)
#install.packages("coefplot")
library(coefplot)
#library(dplyr)
library(plotly)
#library(ggcoefstats)
options(tigris_use_cache = TRUE)
This dataset is the output of the data cleaning script.
#redist <- read.csv("./source_data/analysis_data.csv", stringsAsFactors = FALSE)
We worked with three separate data sets for our research project, which needed to be organized and merged to state-election year pairings for our analysis. Single district states were left in the data to be included for certain visualizations, but they would ultimately be dropped for most of the substantive analysis.
This data set began as state-redistricting map pairings, tied to the original census year that maps were based on. A significant complicating factor, however, is that several states have map revisions as a result of court challenges, creating inconsistent numbers of pairings. Fortunately, the data also has columns denoting the years a given map was active, which could be used to associate an individual map with the elections where it was active. Since this data frame would be merged with both of our other data sets, this is also the step where we added dichotomous dummy variables for use in later analysis.
This was merged in several steps: 1. Filtering out state house and state senate maps, as well as the 2020 congressional maps 2. Filling in blank entries in the “Drawn.by” column with “Institution” 3. Identifying states that have revised maps (specifically FL, NC, OH, PA, RI, TX, VA) 4. Creating an additional data frame for pairing census cycles to election years 5. Doing a full_join on the data frames to create all possible pairings 6. Adding a dummy variable for November 1st of each year as a cutoff point to mark election dates 7. Using the above to create another dummy variable to mark election year-map pairings where a map was inactive and filter them out 8. Adding a dummy variable for which states used more than one redistricting method in the sample window 9. Adding a variety of dichotomous dummy variables to identify different redistricting methods
# Load the raw state/redistricting-cycle data (one row per map revision).
redistricting_raw <- read.csv("./source_data/StatesAndCyclesData.csv",
                              stringsAsFactors = FALSE)

# 1. Keep only congressional maps drawn before the 2020 cycle
#    (state house/senate maps and the 2020 congressional maps are dropped).
# 2. Blank "Drawn.by" entries fall back to the "Institution" column.
redistricting_raw <- redistricting_raw %>%
  filter(Level == "Congress", Cycle.Year < 2020) %>%
  mutate(Drawn.by = ifelse(Drawn.by == "", Institution, Drawn.by))
# 3. Identifying states that have revised maps (FL, NC, OH, PA, RI, TX, VA)
# 3. States whose maps were revised mid-decade appear more than twice in the
#    data (one row per revision): FL, NC, OH, PA, RI, TX, VA.
#    (The original `select(State)` before `pull(State)` was redundant.)
problem_states <- redistricting_raw %>%
  count(State) %>%
  filter(n > 2) %>%
  pull(State)
# 4. Creating an additional data frame for pairing census cycles to election years
# 4. Pair each census cycle with the five congressional election years it
#    governs: elections fall 2, 4, ..., 10 years after the census year.
cycle_pairs <- do.call(rbind, lapply(c(2000, 2010), function(cycle) {
  data.frame(Cycle.Year = cycle, Year = cycle + seq(2, 10, by = 2))
}))
# 5. Doing a full_join on the data frames to create all possible pairings
# 6. Adding a dummy variable for November 1st of each year as a cutoff point to mark election dates
# 5. Full join on Cycle.Year creates every possible map x election-year
#    pairing within a cycle (inactive pairings are filtered out in step 7).
# 6. November 1 of the election year is the cutoff used to decide whether a
#    map was active for that election. Single-district states have blank map
#    dates, so they get a placeholder date; they are never dropped in step 7
#    because they are not in `problem_states`.
redistricting_joined <- full_join(redistricting_raw, cycle_pairs,
                                  by = "Cycle.Year") %>%
  mutate(Election.Cutoff = paste0(Year, "-11-01"),
         Start.Date = ifelse((Start.Date == "") & (Seats == 1),
                             "2001-12-31", Start.Date),
         End.Date = ifelse((End.Date == "") & (Seats == 1),
                           "2001-12-31", End.Date))
# 7. Using the above to create another dummy variable to mark election year-map pairings where a map was inactive and filter them out
# 7. A map is "inactive" for an election when the Nov 1 cutoff falls outside
#    the map's active window. Only revised-map states need the date check;
#    all other states keep their single map. Inactive pairings are dropped.
#    Fixes vs. original: columns referenced bare inside mutate() instead of
#    via `redistricting_joined$` (which bypasses the pipeline), TRUE/FALSE
#    instead of T/F, and the redundant inner ifelse(cond, T, F) removed.
redistricting <- redistricting_joined %>%
  mutate(Inactive = ifelse(!(State %in% problem_states), FALSE,
                           difftime(Start.Date, Election.Cutoff) > 0 |
                             difftime(End.Date, Election.Cutoff) < 0)) %>%
  filter(!Inactive)
# 8. Adding a dummy variable for which states used more than one redistricting method in the sample window
# 8. Flag states that used more than one redistricting method (distinct
#    values of Drawn.by) during the sample window.
#    Fixes vs. original: `<-` instead of `=` for assignment, and the join
#    key made explicit (the implicit natural join matched only on "State").
redistricting_changes <- redistricting %>%
  group_by(State) %>%
  summarize(redistricting_methods = n_distinct(Drawn.by)) %>%
  mutate(changed = ifelse(redistricting_methods > 1, 1, 0))
redistricting <- left_join(redistricting, redistricting_changes, by = "State")
# Example showing the successful isolation of map revisions and accurate
# pairings: revised-map states should show each election year matched only
# to the map actually in force at that election.
redistricting %>%
filter(State %in% problem_states) %>%
select(State, Drawn.by, Year, Start.Date, End.Date, changed)
# 9. Adding a variety of dichotomous dummy variables to identify different redistricting methods for later analysis
# 9. Dichotomous indicators for each redistricting method, plus a midterm
#    flag, for use in the regressions below.
redistricting <- redistricting %>%
  mutate(legislature = ifelse(Drawn.by == "Legislature", 1, 0),
         # "Partisan" = map drawn by an actor with a direct partisan stake.
         partisan = ifelse(Drawn.by %in% c("Legislature",
                                           "Politician commission"), 1, 0),
         independent_commission = ifelse(Drawn.by == "Independent commission",
                                         1, 0),
         # Matches both "State court" and "Federal court".
         court = ifelse(str_detect(tolower(Drawn.by), "court"), 1, 0),
         # Midterm congressional elections fall in years where Year %% 4 == 2.
         mid_term = ifelse(Year %% 4 == 2, 1, 0))
redistricting %>%
  select(Year, State, Drawn.by, changed, legislature, partisan,
         independent_commission, court, mid_term)
The MIT election lab data was more granular than necessary for our analysis, containing state-election year-district-candidate observations with vote totals for each candidate. It was aggregated to the district level, pivoted wider to create state-election year-district observations, and then merged with the redistricting data. Third party candidates were grouped as “OTHER” for the aggregation, and Minnesota’s Democratic party was renamed to match the national party.
# Candidate-level MIT House returns, 1976-2020.
elec <- read.csv("./source_data/1976-2020-house.csv")

# Collapse candidate-level returns (2002 onward) to party vote totals per
# district, spread the parties into columns, attach the redistricting data,
# and compute the Democratic margin as a share of total votes cast.
votes <- elec %>%
  filter(year >= 2002) %>%
  # The two major parties are kept as-is; Minnesota's Democratic-Farmer-Labor
  # party is folded into DEMOCRAT; all other parties become OTHER.
  mutate(party = ifelse(party == "REPUBLICAN", "REPUBLICAN",
                        ifelse(party == "DEMOCRAT", "DEMOCRAT",
                               ifelse(str_detect(party, "DEMOCRATIC-FARM"),
                                      "DEMOCRAT", "OTHER")))) %>%
  group_by(year, state_po, district, party) %>%
  summarize(votes = sum(candidatevotes), totalvotes = mean(totalvotes)) %>%
  pivot_wider(names_from = party, values_from = votes, values_fill = 0) %>%
  left_join(redistricting, by = c("year" = "Year", "state_po" = "State")) %>%
  mutate(dem_margin = 100 * (DEMOCRAT - REPUBLICAN) / totalvotes)
write.csv(votes, "votes_redistricting.csv", row.names = FALSE)
votes
This data set contained state-election year-voter observations. We keep only the required span (2000 to 2020) and drop states with only one congressional district. Then we convert the party variable to three categories — Republican, Democrat, and Other — pivot to wide format, and compute vote totals and shares.
Census data was available for the current census and historical data (separated). Current data is combined, historical data is by presidential/midterms. We import the relevant cells from the Excel sheets and merge them as needed.
## New names:
## New names:
## New names:
## * `` -> `...1`
## * `` -> `...2`
## * `` -> `...3`
## * `` -> `...4`
## * `` -> `...5`
## * `` -> `...6`
## * `` -> `...7`
## * `` -> `...8`
## * `` -> `...9`
## * `` -> `...10`
## * `` -> `...11`
# Attach full state names so the turnout data, keyed on lowercase state
# names, can be joined to the abbreviation-keyed redistricting data.
# NOTE(review): `states` is built in a chunk not shown here — presumably an
# abbreviation/full-name lookup with columns state_po and state_full; confirm.
redistricting <- left_join(redistricting, states,
by = c("State" = "state_po")) %>%
mutate(Cycle.Year = as.numeric(Cycle.Year),
state_full = tolower(state_full))
# Merge in turnout: one row per state-election-year pairing.
# NOTE(review): `all_years_long` also comes from an unshown chunk (the Excel
# imports above) — verify its columns are turnout_state / turnout_year.
redist_turn <- left_join(redistricting, all_years_long,
by = c("state_full" = "turnout_state", "Year" = "turnout_year"))
https://rconsortium.github.io/censusguide/r-packages-all.html
https://walker-data.com/tidycensus/articles/spatial-data.html
The following code uses the tidycensus package and the Census API to download data + shapefiles from the US Census.
Declare Census product ( decennial).
Get values for total population in 2010 (just for an example), by state. Get shapefiles by setting `geometry = TRUE`.
Subset to US mainland; filter out Alaska, Puerto Rico, Hawaii. Convert state names to lowercase
################
# Declare parameters - API key, variables, time.
# SECURITY NOTE(review): a Census API key was previously hard-coded on this
# line and committed to source, so it must be treated as compromised —
# revoke it and supply a fresh key via the CENSUS_API_KEY environment
# variable (e.g. in ~/.Renviron) instead of embedding it in the script.
census_api_key(Sys.getenv("CENSUS_API_KEY"))
yr_select <- c(2010)

# Pull total population (variable P003001) by state for 2010, with shapefile
# geometry attached, then restrict to the mainland (drop Alaska, Hawaii, and
# Puerto Rico) and lowercase state names to match the join key used later.
stpop <- get_decennial(geography = "state",
                       variables = "P003001",
                       geometry = TRUE,
                       year = yr_select) %>%
  select(-variable) %>%
  filter(NAME != "Puerto Rico", NAME != "Alaska", NAME != "Hawaii")
stpop$NAME <- tolower(stpop$NAME)
###############################
Create a spatial dataframe by merging both datasets.
election_map<-merge(stpop, redist_turn, by.x = "NAME", by.y = "state_full" )
We filter the dataframe to a single year, say, 2012. Then we use ‘ggplot’ with the ‘geom_sf’ function to make a choropleth map, by unordered categorical variable ‘Drawn.by’. This variable includes a string indicating which type of redistricting policy was in effect in a state.
#idvars = c("NAME", "GEOID", "geometry")
# Visualization
## Maps: choropleth of the redistricting policy ("Drawn.by") in effect
## in each state for the 2012 election.
map2012 <- election_map %>% filter(Year == 2012)

redist_map <- map2012 %>%
  ggplot() +
  geom_sf(aes(fill = Drawn.by), color = "black", size = 0.1) +
  # State abbreviations placed at each polygon's centroid.
  # (The original also passed `label.padding`, which is not a geom_sf_text
  # parameter and was silently ignored; removed.)
  geom_sf_text(aes(label = State), family = "Times", size = 2.6,
               color = "black", alpha = 0.9, hjust = 0.5, vjust = 0.5) +
  labs(title = "Redistricting policies in 2012") +
  # Center and bold the title; use Times throughout. The original left a
  # dangling empty `size = ` argument in element_text, now removed.
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        text = element_text(family = "Times")) +
  scale_fill_manual(values = c("#80b1d3", "#ffffb3", "#98FB98", "#fb8072",
                               "#8dd3c7", "#436EEE"))
redist_map
#ggsave("./images/redist_map_2012.png", redist_map)
The following makes a boxplot depicting level of turnout for all elections, categorised by the state’s redistricting policy.
# Analysis sample: drop rows with no recorded redistricting policy,
# single-district states, and any elections after 2020.
redist2 <- redist_turn %>%
  filter(Drawn.by != "n/a",
         Seats != 1,
         Year < 2022)
# Boxplot
## Average turnout by redistricting policy, used to annotate the boxplot
## with each category's mean.
turnout_by_drawnby <- redist2 %>%
  group_by(Drawn.by) %>%
  summarise(turnout_perc = mean(turnout_perc))
# Boxplot of turnout by redistricting policy, flipped so categories read
# horizontally, with each category's mean turnout printed in red.
#TODO: add coloring same as US map coloring
boxplt <- redist2 %>%
  ggplot(aes(x = Drawn.by, y = turnout_perc)) +
  geom_boxplot() +
  geom_text(data = turnout_by_drawnby,
            aes(x = Drawn.by, y = turnout_perc,
                label = round(turnout_perc, 0)),
            color = "red",
            size = 3,
            hjust = 0.5,
            vjust = 0.5) +
  coord_flip() +
  labs(title = "Avg. Turnout from 2000-2020, by Category",
       x = "",
       y = "Percent of eligible voters who voted") +
  # Single theme() call: the original set plot.title twice and left a
  # dangling empty `face =` argument in axis.title, both removed here.
  theme(plot.title = element_text(hjust = 0.5))
#ggsave("./images/boxplot.png", boxplt)
boxplt
### By state
# Turnout percentage over time, one line per state.
# The original title and axis labels described party vote shares ("Dem and
# Rep voter turnout" / "percentage of vote by party"), but the plot actually
# shows turnout_perc by state — labels corrected to match.
election_map %>%
  ggplot(aes(x = Year, y = turnout_perc, colour = State)) +
  geom_line(size = 0.5) +
  labs(title = "Turnout percentage by state, 2000-2020",
       x = "year", y = "turnout percentage") +
  theme(plot.title = element_text(hjust = 0.5))
# Aggregate turnout to institution-by-year averages, dropping rows with no
# recorded policy.
inst_lineplt <- election_map %>%
  group_by(Drawn.by, Year) %>%
  summarise(avg_turnout = mean(turnout_perc)) %>%
  filter(Drawn.by != "n/a")

# Restrict to the four most common institutions for a readable plot.
lesscom <- inst_lineplt %>%
  filter(Drawn.by %in% c("State court", "Federal court",
                         "Legislature", "Independent commission"))

# One line per institution, average turnout over time.
lineplt <- lesscom %>%
  ggplot(aes(x = Year, y = avg_turnout, colour = Drawn.by)) +
  geom_line(size = .7) +
  labs(title = "Avg. Turnout percentage 2000-2020, by redistricting ",
       x = "year", y = "Turnout percentage") +
  theme(plot.title = element_text(hjust = 0.5))
lineplt
#ggsave("./images/lineplt_trn.html", lineplt)
We run three regression models with the same basic model.
Our main independent variable derives from the ‘Institution’ or ‘Drawn.By’ columns. For the first set of OLS results, we run three models.
\(y_{i} = \beta_0 + \beta_1 Policy_{i} + \beta_2 Midterm_{i} + \epsilon_{i}\)
turn_reg_all <- lm(turnout_perc ~ Drawn.by + mid_term, data = redist2)
Run the above model without the midterm dummy, separately for midterm elections and presidential elections. We use the ‘mid_term’ indicator variable generated.
\(y_{i} = \beta_0 + \beta_1 Policy_{i} + \epsilon_{i}\)
#Subset to midterms, and estimate the model
# The prose specifies the model WITHOUT the midterm dummy for these subset
# regressions. The original included `mid_term` anyway; within each subset it
# is constant, so lm() silently dropped it as rank-deficient. It is removed
# here — the policy coefficient estimates are unchanged.
redist_mid <- redist2 %>% filter(mid_term == 1)
Midterm <- lm(turnout_perc ~ Drawn.by, data = redist_mid)
#Subset to Presidential elections, and estimate the model
redist_pres <- redist2 %>% filter(mid_term == 0)
Presidential <- lm(turnout_perc ~ Drawn.by, data = redist_pres)
We use stargazer to generate a formatted table of the regression coefficients for the three estimated models.
stargazer(turn_reg_all, Midterm, Presidential, type ="html")
| Dependent variable: | |||
| turnout_perc | |||
| (1) | (2) | (3) | |
| Drawn.byFederal court | 2.594 | 2.178 | 3.096 |
| (1.729) | (2.358) | (2.601) | |
| Drawn.byIndependent commission | -0.099 | -0.429 | 0.321 |
| (1.740) | (2.375) | (2.613) | |
| Drawn.byLegislature | 2.019 | 1.883 | 2.253 |
| (1.563) | (2.087) | (2.399) | |
| Drawn.byPolitician commission | -4.606** | -6.170** | -2.945 |
| (1.879) | (2.583) | (2.797) | |
| Drawn.byState court | 3.147* | 3.313 | 3.078 |
| (1.657) | (2.237) | (2.516) | |
| mid_term | -5.138*** | ||
| (0.468) | |||
| Constant | 70.343*** | 65.400*** | 70.050*** |
| (1.559) | (2.042) | (2.364) | |
| Observations | 430 | 215 | 215 |
| R2 | 0.285 | 0.132 | 0.075 |
| Adjusted R2 | 0.275 | 0.111 | 0.053 |
| Residual Std. Error | 4.850 (df = 423) | 5.002 (df = 209) | 4.727 (df = 209) |
| F Statistic | 28.142*** (df = 6; 423) | 6.329*** (df = 5; 209) | 3.379*** (df = 5; 209) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
Use models 2 and 3 above to make a coefficient plot using the ‘multiplot’ function from the ‘coefplot’ package. It shows that the coefficient estimates are always more negative or less positive for midterms, compared to Presidential elections.
# Make a coefplot: multiplot() (from the coefplot package loaded above)
# overlays the coefficient estimates and intervals of both subset models,
# omitting the intercepts.
multiplot(Midterm, Presidential, intercept=FALSE)
Estimating OLS models with panel data may lead to biased and imprecise estimates. The reasons for this:
Unobserved heterogeneity across states, which we don’t control for due to omitted variables.
Errors might not be independent, i.e. they may be serially/autocorrelated across time. OLS regression model assumes that the errors are independent and identically distributed, but this assumption may not hold for panel data. When the errors are correlated across time for each district.
We use the following panel regression model:
\(y_{it} = \beta_0 + \beta_1 Policy_{it} + \beta_2 state_i + \beta_3 year_t + \epsilon_{it}\)
Where \(y_{it} =\) Turnout percentage for state i in year t \(x_{it} =\) Binary variable for policy (court, legislature, or partisan) \(state_i =\) State fixed effects \(year_t =\) Year fixed effects \(\epsilon_{it} =\) Error term
We use the R plm package
We use the three binary variables generated to indicate the state’s redistricting policy
Get dummies for each category (court, legislature, independent etc)
Make custom variables ‘partisan’=1 or 0, and categorize the above cases into it.
library(plm)
## Warning: package 'plm' was built under R version 4.1.3
##
## Attaching package: 'plm'
## The following object is masked from 'package:data.table':
##
## between
## The following objects are masked from 'package:dplyr':
##
## between, lag, lead
# Two-way fixed-effects (state + year) panel regressions. mid_term is a
# function of Year, so under two-way effects it is collinear with the year
# dummies and plm drops it.
plm1 <- plm(turnout_perc ~ partisan + mid_term, data=redist2, index=c("State", "Year"), model="within", effect = "twoways")
# NOTE(review): plm2 uses the default one-way (individual/state) effects,
# unlike plm1 and plm3, and is never reported in the table below — confirm
# whether it was meant to be `effect = "twoways"` and included.
plm2 <- plm(turnout_perc ~ legislature + mid_term, data=redist2, index=c("State", "Year"), model="within")
plm3 <- plm(turnout_perc ~ court, data=redist2, index=c("State", "Year"),
model="within", effect = "twoways")
# Text table of the two two-way FE models; the fixed-effects rows are added
# manually since plm absorbs them rather than reporting coefficients.
stargazer(plm1, plm3, type='text',
add.lines=list(c('State fixed effects', 'Yes','Yes'), c('Year fixed effects', 'Yes','Yes')))
====================================================== Dependent
variable:
—————————- turnout_perc
(1) (2)
—————————————————— partisan -0.969**
(0.491)
court 1.260**
(0.514)
State fixed effects Yes Yes
Year fixed effects Yes Yes
Observations 430 430
R2 0.010 0.016
Adjusted R2 -0.126 -0.120
F Statistic (df = 1; 377) 3.893** 6.010**
====================================================== Note:
p<0.1; p<0.05; p<0.01